## Evaluation - Four metrics are calculated using the test set - \(\text{Accuracy}=\frac{\sum{\mathbb{1}\left(\text{Actual Label} = \text{Predicted Label}\right)}}{\text{Label Count}}\) - \(\text{Recall}=\frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}\) - \(\text{Precision}=\frac{\text{True Positives}}{\text{True Positives}+\text{False Positives}}\) - \(\text{F1}=\frac{2*(\text{Precision}*\text{Recall})}{\text{Precision}+\text{Recall}}\)
While a small number of threatened species have a large range, Range appears to be a strong predictor of Group status.
A lower range predicts a higher likelihood of threatened or extinct grouping.
# Separate the feature columns (1-14) from the class label (column 15).
features <- data[, 1:14]
label <- data[, 15]

# Stratified 70/30 train/test split; caTools::sample.split preserves the
# label distribution across the two partitions. Seed fixed for reproducibility.
set.seed(42)
split <- sample.split(label, SplitRatio = 0.7)
features_train <- features[split, ]
features_test <- features[!split, ]
label_train <- label[split]
label_test <- label[!split]
# Re-attach the label so the resampling step can work on one data frame.
data_train <- features_train
data_train$label <- label_train

# Class distribution before resampling.
# (The original computed this table twice and printed it via paste(), which
# vectorizes over the table and produces one mangled string per class.)
class_counts <- table(data_train$label)
cat("( Before )Data Category Counts:\n")
print(class_counts)

# Working copy for the class-1 vs class-2 oversampling step that follows.
data_train_AB <- data_train
# Pairwise oversampling with ROSE: balance class 1 vs 2, then class 1 vs 3,
# each pair oversampled to N = 980 rows total.
data_train_AB <- data_train_AB[data_train_AB$label != '3', ]
data_train_AB_resampled <- ovun.sample(label ~ ., data = data_train_AB,
                                       method = "over", N = 980, seed = 1)$data
data_train_AC <- data_train
data_train_AC <- data_train_AC[data_train_AC$label != '2', ]
data_train_AC_resampled <- ovun.sample(label ~ ., data = data_train_AC,
                                       method = "over", N = 980, seed = 1)$data

# Recombine: class 1 (taken from the AB run) plus the oversampled 2s and 3s.
data_train_AB_2 <- data_train_AB_resampled[data_train_AB_resampled$label == '2', ]
data_train_AC_3 <- data_train_AC_resampled[data_train_AC_resampled$label == '3', ]
data_train_1 <- data_train_AB_resampled[data_train_AB_resampled$label == '1', ]
data_train_combined <- rbind(data_train_1, data_train_AB_2, data_train_AC_3)

cat("( After )Data Category Counts:\n")
print(table(data_train_combined$label))

# Min-max scale the training features.
# NOTE(review): this runs AFTER data_train_combined was built from the raw
# features, so the combined training frame is still unscaled at this point.
features_train <- as.data.frame(lapply(features_train,
                                       function(x) (x - min(x)) / (max(x) - min(x))))
# Five normalisation schemes for the experiment, applied to the RAW test
# features. (Originally features_test was min-max scaled first and schemes
# 2-5 were then applied to the already-scaled data.)
# NOTE(review): each split is scaled with its own statistics; ideally the
# training min/max (or mean/sd, etc.) would be reused on the test set.
minmax   <- function(x) (x - min(x)) / (max(x) - min(x))
zscore   <- function(x) (x - mean(x)) / sd(x)
maxabs   <- function(x) x / max(abs(x))
l1_scale <- function(x) x / sum(abs(x))
l2_scale <- function(x) x / sqrt(sum(x^2))

features_test_raw <- features_test
# Keep the original side effect: features_test itself ends up min-max scaled.
features_test <- as.data.frame(lapply(features_test_raw, minmax))

features_test_1 <- as.data.frame(lapply(features_test_raw, minmax))
features_test_2 <- as.data.frame(lapply(features_test_raw, zscore))
features_test_3 <- as.data.frame(lapply(features_test_raw, maxabs))
features_test_4 <- as.data.frame(lapply(features_test_raw, l1_scale))
features_test_5 <- as.data.frame(lapply(features_test_raw, l2_scale))

# Matching training frames: the oversampled combined training data with the
# same normalisation applied to its feature columns. (Originally model_1 was
# fit on data_train_1 — the class-1-only subset, whose single-level target
# randomForest rejects — and data_train_2..5 were never defined at all.)
label_col <- ncol(data_train_combined)
normalize_train <- function(fun) {
  out <- data_train_combined
  out[-label_col] <- lapply(out[-label_col], fun)
  out
}
data_train_1 <- normalize_train(minmax)
data_train_2 <- normalize_train(zscore)
data_train_3 <- normalize_train(maxabs)
data_train_4 <- normalize_train(l1_scale)
data_train_5 <- normalize_train(l2_scale)

# Model 1: random forest on the min-max-scaled combined training data.
model_1 <- randomForest(x = data_train_1[-label_col],
                        y = as.factor(data_train_1$label), ntree = 2)
# Evaluate model 1 on the min-max-normalised test set.
variable_importance_1 <- importance(model_1)
pred_comb_1 <- predict(model_1, features_test_1)

accuracy <- sum(label_test == pred_comb_1) / length(label_test)
label_test_factor <- as.factor(label_test)
pred_comb_1_factor <- as.factor(pred_comb_1)
cm <- confusionMatrix(pred_comb_1_factor, label_test_factor)

# Macro-averaged recall and precision over the three classes.
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c("Score")
rownames(printTable) <- c("Accuracy", "Recall", "Precision", "F1")
print(printTable)
# Reported output:
#           Score
# Accuracy   0.93
# Recall     0.87
# Precision  0.89
# F1         0.88
# Model 2: random forest on the z-score-standardised training data,
# evaluated on the matching test set.
model_2 <- randomForest(x = data_train_2[-ncol(data_train_combined)],
                        y = as.factor(data_train_2$label), ntree = 2)
variable_importance_2 <- importance(model_2)
pred_comb_2 <- predict(model_2, features_test_2)

accuracy <- sum(label_test == pred_comb_2) / length(label_test)
label_test_factor <- as.factor(label_test)
# (Originally stored as pred_comb_1_factor — a misleading copy-paste name.)
pred_comb_2_factor <- as.factor(pred_comb_2)
cm <- confusionMatrix(pred_comb_2_factor, label_test_factor)

# Macro-averaged recall and precision over the three classes.
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c("Score")
rownames(printTable) <- c("Accuracy", "Recall", "Precision", "F1")
print(printTable)
# Reported output:
#           Score
# Accuracy   0.33
# Recall     0.48
# Precision  0.37
# F1         0.42
# Model 3: random forest on the max-abs-scaled training data,
# evaluated on the matching test set.
model_3 <- randomForest(x = data_train_3[-ncol(data_train_combined)],
                        y = as.factor(data_train_3$label), ntree = 2)
variable_importance_3 <- importance(model_3)
pred_comb_3 <- predict(model_3, features_test_3)

accuracy <- sum(label_test == pred_comb_3) / length(label_test)
label_test_factor <- as.factor(label_test)
# (Originally stored as pred_comb_1_factor — a misleading copy-paste name.)
pred_comb_3_factor <- as.factor(pred_comb_3)
cm <- confusionMatrix(pred_comb_3_factor, label_test_factor)

# Macro-averaged recall and precision over the three classes.
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c("Score")
rownames(printTable) <- c("Accuracy", "Recall", "Precision", "F1")
print(printTable)
# Reported output:
#           Score
# Accuracy   0.89
# Recall     0.80
# Precision  0.80
# F1         0.80
# Model 4: random forest on the L1-normalised training data,
# evaluated on the matching test set.
model_4 <- randomForest(x = data_train_4[-ncol(data_train_combined)],
                        y = as.factor(data_train_4$label), ntree = 2)
variable_importance_4 <- importance(model_4)
pred_comb_4 <- predict(model_4, features_test_4)

accuracy <- sum(label_test == pred_comb_4) / length(label_test)
label_test_factor <- as.factor(label_test)
# (Originally stored as pred_comb_1_factor — a misleading copy-paste name.)
pred_comb_4_factor <- as.factor(pred_comb_4)
cm <- confusionMatrix(pred_comb_4_factor, label_test_factor)

# Macro-averaged recall and precision over the three classes.
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c("Score")
rownames(printTable) <- c("Accuracy", "Recall", "Precision", "F1")
print(printTable)
# Reported output:
#           Score
# Accuracy   0.76
# Recall     0.59
# Precision  0.60
# F1         0.59
# Model 5: random forest on the L2-normalised training data,
# evaluated on the matching test set.
model_5 <- randomForest(x = data_train_5[-ncol(data_train_combined)],
                        y = as.factor(data_train_5$label), ntree = 2)
variable_importance_5 <- importance(model_5)
pred_comb_5 <- predict(model_5, features_test_5)

accuracy <- sum(label_test == pred_comb_5) / length(label_test)
label_test_factor <- as.factor(label_test)
# (Originally stored as pred_comb_1_factor — a misleading copy-paste name.)
pred_comb_5_factor <- as.factor(pred_comb_5)
cm <- confusionMatrix(pred_comb_5_factor, label_test_factor)

# Macro-averaged recall and precision over the three classes.
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
                 cm$byClass["Class: 2", "Sensitivity"],
                 cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
                    cm$byClass["Class: 2", "Pos Pred Value"],
                    cm$byClass["Class: 3", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c("Score")
rownames(printTable) <- c("Accuracy", "Recall", "Precision", "F1")
print(printTable)
# Reported output:
#           Score
# Accuracy   0.71
# Recall     0.51
# Precision  0.48
# F1         0.49
# The pairwise-oversampling block appeared here twice, verbatim (the second
# copy with two statements fused onto one line — a syntax error). ovun.sample
# is deterministic with seed = 1, so one pass reproduces data_train_combined
# exactly; a single copy is kept.
data_train_AB <- data_train
data_train_AB <- data_train_AB[data_train_AB$label != '3', ]
data_train_AB_resampled <- ovun.sample(label ~ ., data = data_train_AB,
                                       method = "over", N = 980, seed = 1)$data
data_train_AC <- data_train
data_train_AC <- data_train_AC[data_train_AC$label != '2', ]
data_train_AC_resampled <- ovun.sample(label ~ ., data = data_train_AC,
                                       method = "over", N = 980, seed = 1)$data

# Recombine: class 1 (from the AB run) plus the oversampled 2s and 3s.
data_train_AB_2 <- data_train_AB_resampled[data_train_AB_resampled$label == '2', ]
data_train_AC_3 <- data_train_AC_resampled[data_train_AC_resampled$label == '3', ]
data_train_1 <- data_train_AB_resampled[data_train_AB_resampled$label == '1', ]
data_train_combined <- rbind(data_train_1, data_train_AB_2, data_train_AC_3)

cat("( After )Data Category Counts:\n")
print(table(data_train_combined$label))